The idea is to with minimal modification bring added value to submission.

Most of the data gets written back to the data folder as rds:

rds_list <- gsub(".rds","",list.files(path=paste0(project_wd,"/data"),pattern="rds"))
rds_list
 [1] "air_reserve"                "air_store_id_list"         
 [3] "air_store_info"             "air_visit_data"            
 [5] "air_visit_data_with_splits" "date_info"                 
 [7] "hpg_reserve"                "hpg_store_info"            
 [9] "local_competition"          "modelling_data_train"      
[11] "modelling_data_validate"    "raw_modelling_data"        
[13] "reservation_data"           "sample_submission"         
[15] "store_id_relation"         

Load them all:

for (filename in rds_list)
{assign(filename,readRDS(paste0(project_wd,"/data/",filename,".rds")))}

Get the ids in sample submission (last ten characters are the visit date of the restaurant):

sample_sub_ids <- sample_submission %>%
                  select(id) %>%
                  mutate(id=substr(id,1,nchar(id)-11)) %>%
                  unique %>%
                  unlist

There are 821 restaurants for which we want to predict visit volume.

These are all present in the modelling data, and only 18.27% is present in the relation with hpg data.

sum(sample_sub_ids %in% unique(modelling_data_train$air_store_id))/
  length(sample_sub_ids)*100
[1] 100
sum(sample_sub_ids %in% unique(store_id_relation$air_store_id))/
  length(sample_sub_ids)*100
[1] 18.2704

Since hpg is not available for all the restaurants in the submission. We could think of extending any added value of the ones present through location, and genre matching.

There are 34 genres in hpg:

unique(hpg_store_info$hpg_genre_name)
 [1] "Japanese style"               "Italian"                     
 [3] "International cuisine"        "Grilled meat"                
 [5] "Creation"                     "Shabu-shabu/Sukiyaki"        
 [7] "Korean cuisine"               "Creative Japanese food"      
 [9] "Japanese cuisine/Kaiseki"     "Seafood"                     
[11] "Japanese food in general"     "Party"                       
[13] "Okonomiyaki/Monja/Teppanyaki" "Sushi"                       
[15] "Spain Bar/Italian Bar"        "Chinese general"             
[17] "Bistro"                       "French"                      
[19] "Karaoke"                      "Pasta/Pizza"                 
[21] "Cafe"                         "Sweets"                      
[23] "Steak/Hamburger/Curry"        "Thai/Vietnamese food"        
[25] "Western food"                 "Taiwanese/Hong Kong cuisine" 
[27] "Cantonese food"               "Bar/Cocktail"                
[29] "Dim Sum/Dumplings"            "Amusement bar"               
[31] "Sichuan food"                 "Shanghai food"               
[33] "Spain/Mediterranean cuisine"  "Udon/Soba"                   

But only 14 in air

unique(air_store_info$air_genre_name)
 [1] "Italian/French"               "Dining bar"                  
 [3] "Yakiniku/Korean food"         "Cafe/Sweets"                 
 [5] "Izakaya"                      "Okonomiyaki/Monja/Teppanyaki"
 [7] "Bar/Cocktail"                 "Japanese food"               
 [9] "Creative cuisine"             "Other"                       
[11] "Western food"                 "International cuisine"       
[13] "Asian"                        "Karaoke/Party"               

Let’s check if some genres are more popular at some periods of the year

GenreDateVolumes <- air_visit_data %>%
  left_join(air_store_info,by=c("air_store_id")) %>%
  select(air_genre_name,visit_date,visitors) %>%
  mutate(visit_dt=lubridate::month(visit_date,label=T)) %>%
  group_by(visit_dt,air_genre_name) %>%
  summarise(Visits=sum(visitors))

Japanese food and bars seem to experience a drop during the last 6 months of the year, that italian/french cuisine do not seem to experience. Okonomiyaki and so tend to increase during that period.

library(plotly)
p<- GenreDateVolumes %>%
  #group_by(visit_month) %>%
  #summarise(Visits=sum(Visits)) %>%
  #filter(Visits<50000, Visits>1000) %>%
ggplot(aes(visit_dt,Visits,col=air_genre_name)) +
  geom_point() +
  labs(y="Visits",x = "Date")
p <- ggplotly(p)
We recommend that you use the dev version of ggplot2 with `ggplotly()`
Install it with: `devtools::install_github('hadley/ggplot2')`
p
#library(ggmap)
#Japan <- get_map("Japan",zoom=5)
install.packages("leaflet")
Installing package into 㤼㸱C:/Users/Fabien/Documents/R/win-library/3.4㤼㸲
(as 㤼㸱lib㤼㸲 is unspecified)
also installing the dependencies 㤼㸱gridExtra㤼㸲, 㤼㸱raster㤼㸲, 㤼㸱viridis㤼㸲

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/gridExtra_2.3.zip'
Content type 'application/zip' length 1084510 bytes (1.0 MB)
downloaded 1.0 MB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/raster_2.6-7.zip'
Content type 'application/zip' length 3556628 bytes (3.4 MB)
downloaded 3.4 MB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/viridis_0.4.1.zip'
Content type 'application/zip' length 1754967 bytes (1.7 MB)
downloaded 1.7 MB

trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.4/leaflet_1.1.0.zip'
Content type 'application/zip' length 2528284 bytes (2.4 MB)
downloaded 2.4 MB
package ‘gridExtra’ successfully unpacked and MD5 sums checked
package ‘raster’ successfully unpacked and MD5 sums checked
package ‘viridis’ successfully unpacked and MD5 sums checked
package ‘leaflet’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\Fabien\AppData\Local\Temp\RtmpeAu0LR\downloaded_packages
m <- leaflet()
store_points <- air_store_info %>%
  select(latitude,longitude)
#p <- ggmap(Japan)
m <- addTiles(m)
m <- addMarkers(m, lng=store_points$longitude[1],lat=store_points$latitude[1])
#p
m
LS0tDQp0aXRsZTogIkJ1aWxkIGFuYWx5c2lzIGZyb20gVG9tJ3MiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpUaGUgaWRlYSBpcyB0byB3aXRoIG1pbmltYWwgbW9kaWZpY2F0aW9uIGJyaW5nIGFkZGVkIHZhbHVlIHRvIHN1Ym1pc3Npb24uDQoNCk1vc3Qgb2YgdGhlIGRhdGEgZ2V0cyB3cml0dGVuIGJhY2sgdG8gdGhlIGRhdGEgZm9sZGVyIGFzIHJkczoNCmBgYHtyfQ0KcmRzX2xpc3QgPC0gZ3N1YigiLnJkcyIsIiIsbGlzdC5maWxlcyhwYXRoPXBhc3RlMChwcm9qZWN0X3dkLCIvZGF0YSIpLHBhdHRlcm49InJkcyIpKQ0KcmRzX2xpc3QNCmBgYA0KDQpMb2FkIHRoZW0gYWxsOg0KYGBge3J9DQpmb3IgKGZpbGVuYW1lIGluIHJkc19saXN0KQ0Ke2Fzc2lnbihmaWxlbmFtZSxyZWFkUkRTKHBhc3RlMChwcm9qZWN0X3dkLCIvZGF0YS8iLGZpbGVuYW1lLCIucmRzIikpKX0NCmBgYA0KDQpHZXQgdGhlIGlkcyBpbiBzYW1wbGUgc3VibWlzc2lvbiAobGFzdCB0ZW4gY2hhcmFjdGVycyBhcmUgdGhlIHZpc2l0IGRhdGUgb2YgdGhlIHJlc3RhdXJhbnQpOg0KYGBge3J9DQpzYW1wbGVfc3ViX2lkcyA8LSBzYW1wbGVfc3VibWlzc2lvbiAlPiUNCiAgICAgICAgICAgICAgICAgIHNlbGVjdChpZCkgJT4lDQogICAgICAgICAgICAgICAgICBtdXRhdGUoaWQ9c3Vic3RyKGlkLDEsbmNoYXIoaWQpLTExKSkgJT4lDQogICAgICAgICAgICAgICAgICB1bmlxdWUgJT4lDQogICAgICAgICAgICAgICAgICB1bmxpc3QNCmBgYA0KDQpUaGVyZSBhcmUgYHIgbGVuZ3RoKHNhbXBsZV9zdWJfaWRzKWAgcmVzdGF1cmFudHMgZm9yIHdoaWNoIHdlIHdhbnQgdG8gcHJlZGljdCB2aXNpdCB2b2x1bWUuDQoNClRoZXNlIGFyZSBhbGwgcHJlc2VudCBpbiB0aGUgbW9kZWxsaW5nIGRhdGEsIGFuZCBvbmx5IDE4LjI3JSBpcyBwcmVzZW50IGluIHRoZSByZWxhdGlvbiB3aXRoIGhwZyBkYXRhLg0KYGBge3J9DQpzdW0oc2FtcGxlX3N1Yl9pZHMgJWluJSB1bmlxdWUobW9kZWxsaW5nX2RhdGFfdHJhaW4kYWlyX3N0b3JlX2lkKSkvDQogIGxlbmd0aChzYW1wbGVfc3ViX2lkcykqMTAwDQpzdW0oc2FtcGxlX3N1Yl9pZHMgJWluJSB1bmlxdWUoc3RvcmVfaWRfcmVsYXRpb24kYWlyX3N0b3JlX2lkKSkvDQogIGxlbmd0aChzYW1wbGVfc3ViX2lkcykqMTAwDQpgYGANCg0KU2luY2UgaHBnIGlzIG5vdCBhdmFpbGFibGUgZm9yIGFsbCB0aGUgcmVzdGF1cmFudHMgaW4gdGhlIHN1Ym1pc3Npb24uIFdlIGNvdWxkIHRoaW5rIG9mIGV4dGVuZGluZyBhbnkgYWRkZWQgdmFsdWUgb2YgdGhlIG9uZXMgcHJlc2VudCB0aHJvdWdoIGxvY2F0aW9uLCBhbmQgZ2VucmUgbWF0Y2hpbmcuDQoNClRoZXJlIGFyZSAzNCBnZW5yZXMgaW4gaHBnOg0KYGBge3J9DQp1bmlxdWUoaHBnX3N0b3JlX2luZm8kaHBnX2dlbnJlX25hbWUpDQpgYGANCg0KQnV0IG9ubHkgMTQgaW4gYWlyDQpgYGB7cn0NCnVuaXF1ZShhaXJfc3RvcmVfaW5mbyRhaXJfZ2VucmVfbmFtZSkNCmBgYA0KDQpMZXQncyBjaGVjayBpZiBzb21lIGdlbnJlcyBhcmUgbW9yZSBwb3B1bGFyIGF0IHNvbWUgcGVyaW9kcyBvZiB0aGUgeWVhcg0KYGBge3J9DQpHZW5yZURhdGVWb2x1bWVzIDwtIGFpcl92aXNpdF9kYXRhICU+JQ0KICBsZWZ0X2pvaW4oYWlyX3N0b3JlX2luZm8sYnk9YygiYWlyX3N0b3JlX2lkIikpICU+JQ0KICBzZWxlY3QoYWlyX2dlbnJlX25hbWUsdmlzaXRfZGF0ZSx2aXNpdG9ycykgJT4lDQogIG11dGF0ZSh2aXNpdF9kdD1sdWJyaWRhdGU6Om1vbnRoKHZpc2l0X2RhdGUsbGFiZWw9VCkpICU+JQ0KICBncm91cF9ieSh2aXNpdF9kdCxhaXJfZ2VucmVfbmFtZSkgJT4lDQogIHN1bW1hcmlzZShWaXNpdHM9c3VtKHZpc2l0b3JzKSkNCg0KYGBgDQoNCkphcGFuZXNlIGZvb2QgYW5kIGJhcnMgc2VlbSB0byBleHBlcmllbmNlIGEgZHJvcCBkdXJpbmcgdGhlIGxhc3QgNiBtb250aHMgb2YgdGhlIHllYXIsIHRoYXQgaXRhbGlhbi9mcmVuY2ggY3Vpc2luZSBkbyBub3Qgc2VlbSB0byBleHBlcmllbmNlLiBPa29ub21peWFraSBhbmQgc28gdGVuZCB0byBpbmNyZWFzZSBkdXJpbmcgdGhhdCBwZXJpb2QuDQpgYGB7cn0NCmxpYnJhcnkocGxvdGx5KQ0KcDwtIEdlbnJlRGF0ZVZvbHVtZXMgJT4lDQogICNncm91cF9ieSh2aXNpdF9tb250aCkgJT4lDQogICNzdW1tYXJpc2UoVmlzaXRzPXN1bShWaXNpdHMpKSAlPiUNCiAgI2ZpbHRlcihWaXNpdHM8NTAwMDAsIFZpc2l0cz4xMDAwKSAlPiUNCmdncGxvdChhZXModmlzaXRfZHQsVmlzaXRzLGNvbD1haXJfZ2VucmVfbmFtZSkpICsNCiAgZ2VvbV9wb2ludCgpICsNCiAgbGFicyh5PSJWaXNpdHMiLHggPSAiRGF0ZSIpDQpwIDwtIGdncGxvdGx5KHApDQpwDQpgYGANCg0KYGBge3J9DQojbGlicmFyeShnZ21hcCkNCiNKYXBhbiA8LSBnZXRfbWFwKCJKYXBhbiIsem9vbT01KQ0KbGlicmFyeShsZWFmbGV0KQ0KYGBgDQoNCmBgYHtyfQ0KbSA8LSBsZWFmbGV0KCkNCg0Kc3RvcmVfcG9pbnRzIDwtIGFpcl9zdG9yZV9pbmZvICU+JQ0KICBzZWxlY3QobGF0aXR1ZGUsbG9uZ2l0dWRlKQ0KDQojcCA8LSBnZ21hcChKYXBhbikNCm0gPC0gYWRkVGlsZXMobSkNCm0gPC0gYWRkTWFya2VycyhtLCBsbmc9c3RvcmVfcG9pbnRzJGxvbmdpdHVkZVsxXSxsYXQ9c3RvcmVfcG9pbnRzJGxhdGl0dWRlWzFdKQ0KI3ANCm0NCg0KYGBgDQoNCg==